Import dataset

setwd("/Users/vitorpeixoto/Documents")
bank_marketing_data_full <- read.csv("bank-additional-full.csv", sep=";",header = TRUE)
summary(bank_marketing_data_full)
##       age                 job            marital     
##  Min.   :17.00   admin.     :10422   divorced: 4612  
##  1st Qu.:32.00   blue-collar: 9254   married :24928  
##  Median :38.00   technician : 6743   single  :11568  
##  Mean   :40.02   services   : 3969   unknown :   80  
##  3rd Qu.:47.00   management : 2924                   
##  Max.   :98.00   retired    : 1720                   
##                  (Other)    : 6156                   
##                education        default         housing     
##  university.degree  :12168   no     :32588   no     :18622  
##  high.school        : 9515   unknown: 8597   unknown:  990  
##  basic.9y           : 6045   yes    :    3   yes    :21576  
##  professional.course: 5243                                  
##  basic.4y           : 4176                                  
##  basic.6y           : 2292                                  
##  (Other)            : 1749                                  
##       loan            contact          month       day_of_week
##  no     :33950   cellular :26144   may    :13769   fri:7827   
##  unknown:  990   telephone:15044   jul    : 7174   mon:8514   
##  yes    : 6248                     aug    : 6178   thu:8623   
##                                    jun    : 5318   tue:8090   
##                                    nov    : 4101   wed:8134   
##                                    apr    : 2632              
##                                    (Other): 2016              
##     duration         campaign          pdays          previous    
##  Min.   :   0.0   Min.   : 1.000   Min.   :  0.0   Min.   :0.000  
##  1st Qu.: 102.0   1st Qu.: 1.000   1st Qu.:999.0   1st Qu.:0.000  
##  Median : 180.0   Median : 2.000   Median :999.0   Median :0.000  
##  Mean   : 258.3   Mean   : 2.568   Mean   :962.5   Mean   :0.173  
##  3rd Qu.: 319.0   3rd Qu.: 3.000   3rd Qu.:999.0   3rd Qu.:0.000  
##  Max.   :4918.0   Max.   :56.000   Max.   :999.0   Max.   :7.000  
##                                                                   
##         poutcome      emp.var.rate      cons.price.idx  cons.conf.idx  
##  failure    : 4252   Min.   :-3.40000   Min.   :92.20   Min.   :-50.8  
##  nonexistent:35563   1st Qu.:-1.80000   1st Qu.:93.08   1st Qu.:-42.7  
##  success    : 1373   Median : 1.10000   Median :93.75   Median :-41.8  
##                      Mean   : 0.08189   Mean   :93.58   Mean   :-40.5  
##                      3rd Qu.: 1.40000   3rd Qu.:93.99   3rd Qu.:-36.4  
##                      Max.   : 1.40000   Max.   :94.77   Max.   :-26.9  
##                                                                        
##    euribor3m      nr.employed     y        
##  Min.   :0.634   Min.   :4964   no :36548  
##  1st Qu.:1.344   1st Qu.:5099   yes: 4640  
##  Median :4.857   Median :5191              
##  Mean   :3.621   Mean   :5167              
##  3rd Qu.:4.961   3rd Qu.:5228              
##  Max.   :5.045   Max.   :5228              
## 

Explorative Data Analysis

First of all, let’s take a look at our dataset.

dim(bank_marketing_data_full)
## [1] 41188    21
names(bank_marketing_data_full)
##  [1] "age"            "job"            "marital"        "education"     
##  [5] "default"        "housing"        "loan"           "contact"       
##  [9] "month"          "day_of_week"    "duration"       "campaign"      
## [13] "pdays"          "previous"       "poutcome"       "emp.var.rate"  
## [17] "cons.price.idx" "cons.conf.idx"  "euribor3m"      "nr.employed"   
## [21] "y"
head(bank_marketing_data_full)
##   age       job marital   education default housing loan   contact month
## 1  56 housemaid married    basic.4y      no      no   no telephone   may
## 2  57  services married high.school unknown      no   no telephone   may
## 3  37  services married high.school      no     yes   no telephone   may
## 4  40    admin. married    basic.6y      no      no   no telephone   may
## 5  56  services married high.school      no      no  yes telephone   may
## 6  45  services married    basic.9y unknown      no   no telephone   may
##   day_of_week duration campaign pdays previous    poutcome emp.var.rate
## 1         mon      261        1   999        0 nonexistent          1.1
## 2         mon      149        1   999        0 nonexistent          1.1
## 3         mon      226        1   999        0 nonexistent          1.1
## 4         mon      151        1   999        0 nonexistent          1.1
## 5         mon      307        1   999        0 nonexistent          1.1
## 6         mon      198        1   999        0 nonexistent          1.1
##   cons.price.idx cons.conf.idx euribor3m nr.employed  y
## 1         93.994         -36.4     4.857        5191 no
## 2         93.994         -36.4     4.857        5191 no
## 3         93.994         -36.4     4.857        5191 no
## 4         93.994         -36.4     4.857        5191 no
## 5         93.994         -36.4     4.857        5191 no
## 6         93.994         -36.4     4.857        5191 no

Then, let’s know better our variables by category.

split(names(bank_marketing_data_full),sapply(bank_marketing_data_full, function(x) paste(class(x), collapse=" ")))
## $factor
##  [1] "job"         "marital"     "education"   "default"     "housing"    
##  [6] "loan"        "contact"     "month"       "day_of_week" "poutcome"   
## [11] "y"          
## 
## $integer
## [1] "age"      "duration" "campaign" "pdays"    "previous"
## 
## $numeric
## [1] "emp.var.rate"   "cons.price.idx" "cons.conf.idx"  "euribor3m"     
## [5] "nr.employed"

Let’s now take a look at some of the individual variables. We look at the difference between mean, median and possible outliers. Some outliers might need to be fixed.

boxplot(bank_marketing_data_full$age, main="Age",
        yaxt="n", xlab="age", horizontal=TRUE,
        col=16)

barplot(table(bank_marketing_data_full$job), main="Job",
        col=16, las=2)

barplot(table(bank_marketing_data_full$marital), main="Marital",
        col=16, las=2)

barplot(table(bank_marketing_data_full$education), main="Education",
        col=16, las=2)

barplot(table(bank_marketing_data_full$default), main="Default",
        col=16, las=2)

barplot(table(bank_marketing_data_full$housing), main="Housing",
        col=16, las=2)

barplot(table(bank_marketing_data_full$loan), main="Loan",
        col=16, las=2)

barplot(table(bank_marketing_data_full$contact), main="Contact",
        col=16, las=2)

barplot(table(bank_marketing_data_full$month), main="Month",
        col=16, las=2)

barplot(table(bank_marketing_data_full$day_of_week), main="Day of Week",
        col=16, las=2)

boxplot(bank_marketing_data_full$duration, main="Duration",
        yaxt="n", xlab="duration", horizontal=TRUE,
        col=16)

boxplot(bank_marketing_data_full$campaign, main="Campaign",
        yaxt="n", xlab="campaign", horizontal=TRUE,
        col=16)

Duration and Campaign have some outliers, but analysing the dataset, we realised that all of them are important to the model and so, we shall keep them.

boxplot(bank_marketing_data_full$pdays, main="Days since last call (pdays)",
        yaxt="n", xlab="pdays", horizontal=TRUE,
        col=16)

Pdays has many outliers. This is due to the fact that if this is the first time calling the client, this variable is set as 999 so this creates a lot of 999 instances. This issue is gonna be solved later.

boxplot(bank_marketing_data_full$previous, main="Previous",
        yaxt="n", xlab="previous", horizontal=TRUE,
        col=16)

barplot(table(bank_marketing_data_full$poutcome), main="Previous outcome (poutcome)",
        col=16, las=2)

boxplot(bank_marketing_data_full$emp.var.rate, main="Employment variation rate (emp.var.rate)",
        yaxt="n", xlab="emp.var.rate", horizontal=TRUE,
        col=16)

boxplot(bank_marketing_data_full$cons.price.idx, main="Consumer price index (cons.price.idx)",
        yaxt="n", xlab="cons.price.idx", horizontal=TRUE,
        col=16)

boxplot(bank_marketing_data_full$cons.conf.idx, main="Consumer confidence index (cons.conf.idx)",
        yaxt="n", xlab="cons.conf.idx", horizontal=TRUE,
        col=16)

boxplot(bank_marketing_data_full$euribor3m, main="Euribor tax at 3 months (euribor3m)",
        yaxt="n", xlab="euribor3m", horizontal=TRUE,
        col=16)

boxplot(bank_marketing_data_full$nr.employed, main="Number of people employed (nr.employed)",
        yaxt="n", xlab="nr.employed", horizontal=TRUE,
        col=16)

barplot(table(bank_marketing_data_full$y), main="Result variable (y)",
        col=16, las=2)

Dealing with dataset problems

Pdays Outliers

As we talked earlier, the value ‘999’ in variable pdays means the client has not been previously contacted. That is a obvious outlier and should not be treated as numeric. Instead we must change all numbers to categories. Use ‘as.factor’ to do that. Also remove classes 25, 26 and 27 who have just one row of data and that can cause trouble when splitting into training and testing data.

bank_marketing_data_full$pdays <- factor(bank_marketing_data_full$pdays)
bank_marketing_data_full<-bank_marketing_data_full[!(bank_marketing_data_full$pdays==20 |
                                                     bank_marketing_data_full$pdays==25 |
                                                     bank_marketing_data_full$pdays==26 |
                                                     bank_marketing_data_full$pdays==27),]
barplot(table(bank_marketing_data_full$pdays), main="Days since last call (pdays)", col=16, las=2)

Missing data

Check missing values for all columns, assuming that ‘unknowns’ (not equal to NAs) are treated as missing values by changing those values to NA.

bank_marketing_data_full[bank_marketing_data_full=="unknown"] <- NA

Below we can see the plot with the missing data in red.

sapply(bank_marketing_data_full, function(x) sum(is.na(x)))
##            age            job        marital      education        default 
##              0            330             80           1730           8597 
##        housing           loan        contact          month    day_of_week 
##            990            990              0              0              0 
##       duration       campaign          pdays       previous       poutcome 
##              0              0              0              0              0 
##   emp.var.rate cons.price.idx  cons.conf.idx      euribor3m    nr.employed 
##              0              0              0              0              0 
##              y 
##              0
aggr_plot <- aggr(bank_marketing_data_full, col=c('blue','red'), numbers=TRUE, sortVars=TRUE, labels=names(bank_marketing_data_full), cex.axis=.7, gap=3, ylab=c("Histogram of missing data","Pattern"))
## Warning in plot.aggr(res, ...): not enough horizontal space to display
## frequencies

## 
##  Variables sorted by number of missings: 
##        Variable       Count
##         default 0.208746115
##       education 0.042006605
##         housing 0.024038462
##            loan 0.024038462
##             job 0.008012821
##         marital 0.001942502
##             age 0.000000000
##         contact 0.000000000
##           month 0.000000000
##     day_of_week 0.000000000
##        duration 0.000000000
##        campaign 0.000000000
##           pdays 0.000000000
##        previous 0.000000000
##        poutcome 0.000000000
##    emp.var.rate 0.000000000
##  cons.price.idx 0.000000000
##   cons.conf.idx 0.000000000
##       euribor3m 0.000000000
##     nr.employed 0.000000000
##               y 0.000000000

Since the dataset is so dense, we can afford to lose rows with missing data. The next step is to do so and generate again the plot of missing data, showing that no missing data exists anymore.

bank_marketing_data_full <- na.omit(bank_marketing_data_full)
aggr_plot <- aggr(bank_marketing_data_full, col=c('blue','red'), numbers=TRUE, sortVars=TRUE, labels=names(bank_marketing_data_full), cex.axis=.7, gap=3, ylab=c("Histogram of missing data","Pattern"))

## 
##  Variables sorted by number of missings: 
##        Variable Count
##             age     0
##             job     0
##         marital     0
##       education     0
##         default     0
##         housing     0
##            loan     0
##         contact     0
##           month     0
##     day_of_week     0
##        duration     0
##        campaign     0
##           pdays     0
##        previous     0
##        poutcome     0
##    emp.var.rate     0
##  cons.price.idx     0
##   cons.conf.idx     0
##       euribor3m     0
##     nr.employed     0
##               y     0

Imbalanced data

One of the main problems of imbalanced data in the result variable is that it generates a model overfitted to the major class. To correct this overfitting we need to balance the samples. This can be done using oversampling by creating new instances of the minor class. Below we show the ratio of yes and no in the result variable before oversampling.

counts <- table(bank_marketing_data_full$y)
barplot(counts,col=c("blue","red"),legend = rownames(counts), main = "Term Deposit")

Now we show the ratio of yes and no in the result variable after oversampling.

bank_marketing_data_full <- ovun.sample(y ~ ., data = bank_marketing_data_full, method = "over",N = 53000)$data

Now let’s observe the barplot of the result variable. It’s much more balanced and thus less prone to overfitting.

counts <- table(bank_marketing_data_full$y)
barplot(counts,col=c("blue","red"),legend = rownames(counts), main = "Term Deposit")

Logistic Regression

Generate the Model

Creating train and test datasets based on splitting the data in a 80/20 ratio.

set.seed(123)
sample = sample.split(bank_marketing_data_full,SplitRatio = 0.80)
train_data = subset(bank_marketing_data_full, sample==TRUE)
test_data =  subset(bank_marketing_data_full, sample==FALSE)

Logistic Regression model

model<-glm(y~.,data = train_data,family = binomial)
summary(model)
## 
## Call:
## glm(formula = y ~ ., family = binomial, data = train_data)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -7.7089  -0.3859  -0.1297   0.5053   3.0889  
## 
## Coefficients:
##                                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                  -2.744e+02  2.773e+01  -9.895  < 2e-16 ***
## age                          -3.132e-03  1.900e-03  -1.648  0.09927 .  
## jobblue-collar               -2.879e-01  6.136e-02  -4.693 2.70e-06 ***
## jobentrepreneur              -1.419e-01  9.272e-02  -1.531  0.12588    
## jobhousemaid                  5.029e-03  1.168e-01   0.043  0.96564    
## jobmanagement                -1.081e-01  6.383e-02  -1.694  0.09032 .  
## jobretired                    5.098e-01  8.841e-02   5.767 8.07e-09 ***
## jobself-employed             -1.138e-01  8.612e-02  -1.322  0.18618    
## jobservices                  -1.620e-01  6.472e-02  -2.504  0.01229 *  
## jobstudent                    2.712e-01  9.336e-02   2.905  0.00367 ** 
## jobtechnician                 2.610e-02  5.305e-02   0.492  0.62279    
## jobunemployed                 2.566e-01  9.825e-02   2.611  0.00902 ** 
## maritalmarried               -2.687e-02  5.157e-02  -0.521  0.60228    
## maritalsingle                 8.353e-02  5.836e-02   1.431  0.15232    
## educationbasic.6y             3.331e-03  1.010e-01   0.033  0.97369    
## educationbasic.9y             4.321e-02  7.769e-02   0.556  0.57804    
## educationhigh.school          8.668e-02  7.505e-02   1.155  0.24809    
## educationilliterate           1.500e+00  6.590e-01   2.277  0.02281 *  
## educationprofessional.course  1.573e-01  8.186e-02   1.921  0.05469 .  
## educationuniversity.degree    3.122e-01  7.618e-02   4.099 4.16e-05 ***
## defaultyes                   -9.427e+00  2.293e+02  -0.041  0.96720    
## housingyes                    2.704e-02  3.134e-02   0.863  0.38827    
## loanyes                      -6.539e-02  4.264e-02  -1.533  0.12519    
## contacttelephone             -5.215e-01  5.629e-02  -9.264  < 2e-16 ***
## monthaug                      1.014e+00  9.756e-02  10.395  < 2e-16 ***
## monthdec                      2.314e-02  1.804e-01   0.128  0.89793    
## monthjul                     -1.555e-01  7.346e-02  -2.116  0.03434 *  
## monthjun                     -1.032e+00  9.185e-02 -11.231  < 2e-16 ***
## monthmar                      2.125e+00  1.145e-01  18.558  < 2e-16 ***
## monthmay                     -8.520e-01  6.095e-02 -13.980  < 2e-16 ***
## monthnov                     -7.714e-01  9.116e-02  -8.462  < 2e-16 ***
## monthoct                      2.518e-01  1.177e-01   2.139  0.03246 *  
## monthsep                      3.465e-01  1.376e-01   2.517  0.01183 *  
## day_of_weekmon                3.083e-02  5.042e-02   0.611  0.54088    
## day_of_weekthu                4.799e-02  4.989e-02   0.962  0.33606    
## day_of_weektue                7.524e-02  5.119e-02   1.470  0.14164    
## day_of_weekwed                2.288e-01  5.046e-02   4.534 5.78e-06 ***
## duration                      6.825e-03  8.060e-05  84.674  < 2e-16 ***
## campaign                     -3.681e-02  8.698e-03  -4.232 2.32e-05 ***
## pdays1                       -4.336e-01  7.491e-01  -0.579  0.56274    
## pdays2                        8.692e-01  6.552e-01   1.327  0.18460    
## pdays3                        9.633e-01  6.101e-01   1.579  0.11434    
## pdays4                       -6.259e-02  6.257e-01  -0.100  0.92032    
## pdays5                        1.538e-01  6.836e-01   0.225  0.82197    
## pdays6                        6.357e-01  6.110e-01   1.040  0.29820    
## pdays7                        6.669e-01  6.704e-01   0.995  0.31989    
## pdays8                       -3.867e-01  8.162e-01  -0.474  0.63566    
## pdays9                       -8.869e-02  6.586e-01  -0.135  0.89288    
## pdays10                       5.177e-01  6.808e-01   0.760  0.44699    
## pdays11                       3.112e-01  7.704e-01   0.404  0.68620    
## pdays12                      -1.070e-01  6.665e-01  -0.160  0.87251    
## pdays13                       1.057e+00  7.933e-01   1.332  0.18276    
## pdays14                      -3.182e-01  7.688e-01  -0.414  0.67897    
## pdays15                       1.428e+00  8.637e-01   1.653  0.09824 .  
## pdays16                      -6.934e-01  1.011e+00  -0.686  0.49279    
## pdays17                      -2.695e+00  1.041e+00  -2.590  0.00961 ** 
## pdays18                      -6.116e-01  1.058e+00  -0.578  0.56316    
## pdays19                      -2.023e+00  1.710e+00  -1.184  0.23655    
## pdays21                       1.045e+01  9.014e+01   0.116  0.90767    
## pdays22                       5.612e-01  1.202e+00   0.467  0.64051    
## pdays999                     -5.660e-01  6.430e-01  -0.880  0.37875    
## previous                     -7.591e-02  5.510e-02  -1.378  0.16832    
## poutcomenonexistent           4.753e-01  7.901e-02   6.016 1.79e-09 ***
## poutcomesuccess               7.268e-01  2.232e-01   3.257  0.00113 ** 
## emp.var.rate                 -2.313e+00  1.023e-01 -22.610  < 2e-16 ***
## cons.price.idx                2.577e+00  1.830e-01  14.081  < 2e-16 ***
## cons.conf.idx                 6.736e-03  6.318e-03   1.066  0.28634    
## euribor3m                     5.823e-01  1.021e-01   5.706 1.16e-08 ***
## nr.employed                   5.590e-03  2.289e-03   2.442  0.01461 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 55979  on 40380  degrees of freedom
## Residual deviance: 27160  on 40312  degrees of freedom
## AIC: 27298
## 
## Number of Fisher Scoring iterations: 11

Checking variable importance for GLM. We could use the R function cor or just plot it. But since we have too many categorical variables, let’s just analyse the p-values obtained in the model and plot them. Since we want p-values below 0.05, let’s invert them by subtracting to 1. This way we will get the highest scores to the most important variables and will only take those whose p-value is above 0.95.

pvalues <- 1-summary(model)$coefficients[,4]
pvalues <- pvalues[-1]
yyy <- as.list(rep(0.95,length(pvalues)))
bp=barplot(pvalues, main="Variable importance according to p-values",col=sample(colours(), 200), las=2, cex.names=0.6, cex.axis = 0.7, mgp = c(-1, -0, -1))
lines(x=bp,y=yyy,col="blue") 

We can compare both results and see if the variable importance according to the p-values checks with the variable correlation between the predictive variables and y.

pairs.panels(bank_marketing_data_full[,c(1:5,21)])

pairs.panels(bank_marketing_data_full[,c(6:10,21)])

pairs.panels(bank_marketing_data_full[,c(11:15,21)])

pairs.panels(bank_marketing_data_full[,c(16:20,21)])

Prediction for Logistic Regression model and its confusion matrix

test_result <- predict(model,test_data,type = "response")
test_result <- ifelse(test_result > 0.5,1,0)

test_result<-round(test_result,0)
test_result<-as.factor(test_result)
levels(test_result)<-c("no","yes")
actual1<-test_data[,21]
levels(actual1)<-c("no","yes")

conf1<-confusionMatrix(actual1,test_result,positive = "yes")
conf1
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   no  yes
##        no  5358  982
##        yes  795 5484
##                                          
##                Accuracy : 0.8592         
##                  95% CI : (0.853, 0.8652)
##     No Information Rate : 0.5124         
##     P-Value [Acc > NIR] : < 2.2e-16      
##                                          
##                   Kappa : 0.7184         
##  Mcnemar's Test P-Value : 1.023e-05      
##                                          
##             Sensitivity : 0.8481         
##             Specificity : 0.8708         
##          Pos Pred Value : 0.8734         
##          Neg Pred Value : 0.8451         
##              Prevalence : 0.5124         
##          Detection Rate : 0.4346         
##    Detection Prevalence : 0.4976         
##       Balanced Accuracy : 0.8595         
##                                          
##        'Positive' Class : yes            
## 

ROC curve and respective Area Under Curve value.

roc <- roc.curve(test_data$y, test_result, plotit = F)
pr <- prediction(as.numeric(test_result), as.numeric(test_data$y))
prf <- performance(pr, measure = "tpr", x.measure = "fpr")
plot(prf)

#Area under ROC curve
auc <- performance(pr, measure = "auc")
auc <- auc@y.values[[1]]
auc
## [1] 0.8592489

Model for the most significative variables

Plot them

Select only most significative variables and plot them.

bank_marketing_data_sig <- bank_marketing_data_full[,c("job","education","contact","month","day_of_week","duration","campaign","poutcome","emp.var.rate","cons.price.idx","euribor3m","nr.employed","y")]
pairs.panels(bank_marketing_data_sig[,c(1:6,13)])

pairs.panels(bank_marketing_data_sig[,c(7:12,13)])

Generate the model

Logistic regression for the most significative variables

model_sig<-glm(y~job+education+contact+month+day_of_week+duration+campaign+poutcome+emp.var.rate+cons.price.idx+euribor3m+nr.employed, data = train_data,family = binomial)
summary(model_sig)
## 
## Call:
## glm(formula = y ~ job + education + contact + month + day_of_week + 
##     duration + campaign + poutcome + emp.var.rate + cons.price.idx + 
##     euribor3m + nr.employed, family = binomial, data = train_data)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -7.6972  -0.3878  -0.1303   0.5069   3.0727  
## 
## Coefficients:
##                                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                  -2.578e+02  2.137e+01 -12.066  < 2e-16 ***
## jobblue-collar               -2.779e-01  6.102e-02  -4.553 5.28e-06 ***
## jobentrepreneur              -1.853e-01  9.219e-02  -2.010   0.0444 *  
## jobhousemaid                 -2.012e-02  1.160e-01  -0.173   0.8623    
## jobmanagement                -1.558e-01  6.299e-02  -2.474   0.0134 *  
## jobretired                    4.132e-01  7.675e-02   5.383 7.31e-08 ***
## jobself-employed             -1.191e-01  8.569e-02  -1.390   0.1644    
## jobservices                  -1.585e-01  6.432e-02  -2.464   0.0137 *  
## jobstudent                    3.722e-01  8.904e-02   4.180 2.92e-05 ***
## jobtechnician                 3.496e-02  5.275e-02   0.663   0.5075    
## jobunemployed                 2.700e-01  9.807e-02   2.753   0.0059 ** 
## educationbasic.6y             2.153e-02  9.997e-02   0.215   0.8295    
## educationbasic.9y             6.139e-02  7.677e-02   0.800   0.4239    
## educationhigh.school          1.239e-01  7.388e-02   1.678   0.0934 .  
## educationilliterate           1.542e+00  6.506e-01   2.371   0.0178 *  
## educationprofessional.course  1.862e-01  8.102e-02   2.298   0.0216 *  
## educationuniversity.degree    3.689e-01  7.457e-02   4.948 7.51e-07 ***
## contacttelephone             -5.083e-01  5.269e-02  -9.647  < 2e-16 ***
## monthaug                      1.024e+00  9.559e-02  10.714  < 2e-16 ***
## monthdec                     -4.433e-02  1.788e-01  -0.248   0.8042    
## monthjul                     -1.207e-01  7.195e-02  -1.678   0.0934 .  
## monthjun                     -9.856e-01  8.535e-02 -11.547  < 2e-16 ***
## monthmar                      2.107e+00  1.070e-01  19.697  < 2e-16 ***
## monthmay                     -8.412e-01  6.002e-02 -14.015  < 2e-16 ***
## monthnov                     -7.747e-01  8.762e-02  -8.842  < 2e-16 ***
## monthoct                      2.308e-01  1.151e-01   2.005   0.0450 *  
## monthsep                      3.271e-01  1.331e-01   2.458   0.0140 *  
## day_of_weekmon                2.801e-02  5.022e-02   0.558   0.5770    
## day_of_weekthu                4.941e-02  4.963e-02   0.996   0.3195    
## day_of_weektue                7.733e-02  5.089e-02   1.520   0.1286    
## day_of_weekwed                2.287e-01  5.022e-02   4.554 5.27e-06 ***
## duration                      6.818e-03  8.037e-05  84.831  < 2e-16 ***
## campaign                     -3.673e-02  8.697e-03  -4.223 2.41e-05 ***
## poutcomenonexistent           5.156e-01  4.641e-02  11.110  < 2e-16 ***
## poutcomesuccess               1.796e+00  7.459e-02  24.086  < 2e-16 ***
## emp.var.rate                 -2.312e+00  1.000e-01 -23.112  < 2e-16 ***
## cons.price.idx                2.479e+00  1.510e-01  16.419  < 2e-16 ***
## euribor3m                     6.667e-01  7.061e-02   9.442  < 2e-16 ***
## nr.employed                   3.903e-03  1.581e-03   2.468   0.0136 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 55979  on 40380  degrees of freedom
## Residual deviance: 27276  on 40342  degrees of freedom
## AIC: 27354
## 
## Number of Fisher Scoring iterations: 6

Prediction for Logistic Regression model and its confusion matrix

test_result_sig <- predict(model_sig,test_data,type = "response")
test_result_sig <- ifelse(test_result_sig > 0.5,1,0)

test_result_sig <- round(test_result_sig,0)
test_result_sig <- as.factor(test_result_sig)
levels(test_result_sig) <- c("no","yes")
actual2 <- test_data[,21]
levels(actual2) <- c("no","yes")

conf2 <- confusionMatrix(actual2,test_result_sig,positive = "yes")
conf2
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   no  yes
##        no  5359  981
##        yes  811 5468
##                                          
##                Accuracy : 0.858          
##                  95% CI : (0.8518, 0.864)
##     No Information Rate : 0.5111         
##     P-Value [Acc > NIR] : < 2.2e-16      
##                                          
##                   Kappa : 0.716          
##  Mcnemar's Test P-Value : 6.545e-05      
##                                          
##             Sensitivity : 0.8479         
##             Specificity : 0.8686         
##          Pos Pred Value : 0.8708         
##          Neg Pred Value : 0.8453         
##              Prevalence : 0.5111         
##          Detection Rate : 0.4333         
##    Detection Prevalence : 0.4976         
##       Balanced Accuracy : 0.8582         
##                                          
##        'Positive' Class : yes            
## 

ROC curve and respective Area Under Curve value.

roc_sig <- roc.curve(test_data$y, test_result_sig, plotit = F)
pr_sig <- prediction(as.numeric(test_result_sig), as.numeric(test_data$y))
prf_sig <- performance(pr_sig, measure = "tpr", x.measure = "fpr")
plot(prf_sig)

#Area under ROC curve
auc_sig <- performance(pr_sig, measure = "auc")
auc_sig <- auc_sig@y.values[[1]]
auc_sig
## [1] 0.8580537